Commit 5748e86 (parent a9658c7)

Introduce LocalScore CLI


46 files changed: +4696 / -1 lines (large commit; only a subset of the changed files is shown below)

Makefile

Lines changed: 4 additions & 0 deletions
@@ -16,6 +16,7 @@ include llamafile/BUILD.mk
 include llama.cpp/BUILD.mk
 include stable-diffusion.cpp/BUILD.mk
 include whisper.cpp/BUILD.mk
+include localscore/BUILD.mk

 # the root package is `o//` by default
 # building a package also builds its sub-packages
@@ -24,6 +25,7 @@ o/$(MODE)/: o/$(MODE)/llamafile \
         o/$(MODE)/llama.cpp \
         o/$(MODE)/stable-diffusion.cpp \
         o/$(MODE)/whisper.cpp \
+        o/$(MODE)/localscore \
         o/$(MODE)/third_party \
         o/$(MODE)/depend.test

@@ -43,6 +45,7 @@ install: llamafile/zipalign.1 \
         o/$(MODE)/llama.cpp/imatrix/imatrix \
         o/$(MODE)/llama.cpp/quantize/quantize \
         o/$(MODE)/llama.cpp/llama-bench/llama-bench \
+        o/$(MODE)/localscore/localscore \
         o/$(MODE)/llama.cpp/perplexity/perplexity \
         o/$(MODE)/llama.cpp/llava/llava-quantize \
         o/$(MODE)/stable-diffusion.cpp/main \
@@ -55,6 +58,7 @@ install: llamafile/zipalign.1 \
         $(INSTALL) o/$(MODE)/llama.cpp/imatrix/imatrix $(PREFIX)/bin/llamafile-imatrix
         $(INSTALL) o/$(MODE)/llama.cpp/quantize/quantize $(PREFIX)/bin/llamafile-quantize
         $(INSTALL) o/$(MODE)/llama.cpp/llama-bench/llama-bench $(PREFIX)/bin/llamafile-bench
+        $(INSTALL) o/$(MODE)/localscore/localscore $(PREFIX)/bin/localscore
         $(INSTALL) build/llamafile-convert $(PREFIX)/bin/llamafile-convert
         $(INSTALL) build/llamafile-upgrade-engine $(PREFIX)/bin/llamafile-upgrade-engine
         $(INSTALL) o/$(MODE)/llama.cpp/perplexity/perplexity $(PREFIX)/bin/llamafile-perplexity

RELEASE.md

Lines changed: 129 additions & 0 deletions
# Making a Llamafile Release

Making a Llamafile release involves a few steps, which are detailed in this document.

The two primary artifacts of the release are the `llamafile-<version>.zip` and the binaries for the GitHub release.

## Release Process

Note: Steps 2 and 3 are only needed if you are making a new release of the ggml-cuda.so and ggml-rocm.so shared libraries. You only need to do this when you are changing the CUDA code or the APIs surrounding it; otherwise you can reuse the shared libraries from the previous release.

1. Update the version number in `version.h`.
2. Build the ggml-cuda.so and ggml-rocm.so shared libraries on Linux. You need to do this for both Llamafile and LocalScore: for CUDA, Llamafile uses TINYBLAS by default while LocalScore uses CUBLAS by default.
   - For Llamafile, run the scripts `./llamafile/cuda.sh` and `./llamafile/rocm.sh` respectively.
   - For LocalScore, run the script `./localscore/cuda.sh`.
   - The files will be built and placed in your home directory.
3. Build the ggml-cuda.dll and ggml-rocm.dll shared libraries on Windows. You need to do this for both Llamafile and LocalScore.
   - For Llamafile, run the scripts `./llamafile/cuda.bat` and `./llamafile/rocm.bat` respectively.
   - For LocalScore, run the script `./localscore/cuda.bat`.
   - The files will be built and placed in the `build/release` directory.
4. Build the project with `make -j8`.
5. Install the built project to your /usr/local/bin directory with `sudo make install PREFIX=/usr/local`.

### Llamafile Release Zip

The easiest way to create the release zip is to run:

`make install PREFIX=<preferred_dir>/llamafile-<version>`

After the directory is created, you will want to bundle the built shared libraries into the following release binaries:

- `llamafile`
- `localscore`
- `whisperfile`

You can do this for each binary with a command like the following.

Note: you MUST put the shared libraries in the same directory as the binary you are packaging.

For llamafile and whisperfile you can do the following:

`zipalign -j0 llamafile ggml-cuda.so ggml-rocm.so ggml-cuda.dll ggml-rocm.dll`
`zipalign -j0 whisperfile ggml-cuda.so ggml-rocm.so ggml-cuda.dll ggml-rocm.dll`

After doing this, delete the ggml-cuda.so and ggml-cuda.dll files from the directory, then copy and rename the ggml-cuda.localscore.so and ggml-cuda.localscore.dll files into it:

```
rm <path_to>/llamafile-<version>/bin/ggml-cuda.so <path_to>/llamafile-<version>/bin/ggml-cuda.dll
cp ~/ggml-cuda.localscore.so <path_to>/llamafile-<version>/bin/ggml-cuda.so
cp ~/ggml-cuda.localscore.dll <path_to>/llamafile-<version>/bin/ggml-cuda.dll
```

For localscore you can now package it:

`zipalign -j0 localscore ggml-cuda.so ggml-rocm.so ggml-cuda.dll ggml-rocm.dll`

After you have done this for all the binaries, copy the existing PDFs (from the prior release) into the directory:

`cp <path_to>/doc/*.pdf <path_to>/llamafile-<version>/share/doc/llamafile/`

The zip is structured as follows:

```
llamafile-<version>
|-- README.md
|-- bin
|   |-- llamafile
|   |-- llamafile-bench
|   |-- llamafile-convert
|   |-- llamafile-imatrix
|   |-- llamafile-perplexity
|   |-- llamafile-quantize
|   |-- llamafile-tokenize
|   |-- llamafile-upgrade-engine
|   |-- llamafiler
|   |-- llava-quantize
|   |-- localscore
|   |-- sdfile
|   |-- whisperfile
|   `-- zipalign
`-- share
    |-- doc
    |   `-- llamafile
    |       |-- llamafile-imatrix.pdf
    |       |-- llamafile-perplexity.pdf
    |       |-- llamafile-quantize.pdf
    |       |-- llamafile.pdf
    |       |-- llamafiler.pdf
    |       |-- llava-quantize.pdf
    |       |-- whisperfile.pdf
    |       `-- zipalign.pdf
    `-- man
        `-- man1
            |-- llamafile-imatrix.1
            |-- llamafile-perplexity.1
            |-- llamafile-quantize.1
            |-- llamafile.1
            |-- llamafiler.1
            |-- llava-quantize.1
            |-- whisperfile.1
            `-- zipalign.1
```

Before you zip the directory, remove the shared libraries from it:

`rm *.so *.dll`

You can then zip the directory with the following command:

`zip -r llamafile-<version>.zip llamafile-<version>`

### Llamafile Release Binaries

After you have built the zip, it is quite easy to create the release binaries.

The following binaries are part of the release:

- `llamafile`
- `llamafile-bench`
- `llamafiler`
- `sdfile`
- `localscore`
- `whisperfile`
- `zipalign`

You can use the release script to create the appropriately named binaries:

`./llamafile/release.sh -v <version> -s <source_dir> -d <dest_dir>`

Make sure to move the llamafile-<version>.zip file to the <dest_dir> as well; once you have tested everything, you are good to release.

llama.cpp/BUILD.mk

Lines changed: 1 addition & 1 deletion
@@ -93,4 +93,4 @@ o/$(MODE)/llama.cpp: \
         o/$(MODE)/llama.cpp/imatrix \
         o/$(MODE)/llama.cpp/quantize \
         o/$(MODE)/llama.cpp/perplexity \
-        o/$(MODE)/llama.cpp/llama-bench
+        o/$(MODE)/llama.cpp/llama-bench \

llama.cpp/ggml-cuda.cu

Lines changed: 12 additions & 0 deletions
@@ -17266,6 +17266,18 @@ GGML_CALL int ggml_backend_cuda_get_device_count() {
     return ggml_cuda_info().device_count;
 }

+GGML_CALL void ggml_backend_cuda_get_device_properties(int device, struct ggml_cuda_device_properties * properties) {
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+
+    strncpy(properties->name, prop.name, sizeof(properties->name) - 1);
+    properties->totalGlobalMem = prop.totalGlobalMem;
+    properties->multiProcessorCount = prop.multiProcessorCount;
+    properties->major = prop.major;
+    properties->minor = prop.minor;
+    snprintf(properties->compute, sizeof(properties->compute), "%d.%d", prop.major, prop.minor);
+}
+
 GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
     cudaDeviceProp prop;
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));

llama.cpp/ggml-cuda.h

Lines changed: 10 additions & 0 deletions
@@ -20,6 +20,15 @@ extern "C" {

 #define GGML_CUDA_MAX_DEVICES 16

+struct ggml_cuda_device_properties {
+    char name[256];
+    size_t totalGlobalMem;
+    int multiProcessorCount;
+    int major;
+    int minor;
+    char compute[8];
+};
+
 GGML_API GGML_CALL bool ggml_cuda_link(const struct ggml_backend_api * backend_api);

 // backend API
@@ -37,6 +46,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_typ
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

 GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
+GGML_API GGML_CALL void ggml_backend_cuda_get_device_properties(int device, struct ggml_cuda_device_properties * properties);
 GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
 GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
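As a quick illustration of the new API (not part of this commit), a caller could enumerate CUDA devices and read the new properties struct roughly as in the sketch below. The repo-relative include path and the standalone `main` are assumptions for the example; only the declarations shown in the diff above are relied on.

```c
// Hedged sketch: list CUDA devices via the new properties accessor.
#include <stdio.h>
#include "llama.cpp/ggml-cuda.h"  // assumed include path, mirroring main.cpp

int main(void) {
    int n = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n; i++) {
        struct ggml_cuda_device_properties props;
        ggml_backend_cuda_get_device_properties(i, &props);
        // name, VRAM, SM count, and the "major.minor" compute capability string
        printf("device %d: %s, %zu bytes VRAM, %d SMs, compute %s\n",
               i, props.name, props.totalGlobalMem,
               props.multiProcessorCount, props.compute);
    }
    return 0;
}
```

This is presumably the kind of per-GPU information LocalScore reports alongside its benchmark results.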

llama.cpp/ggml-metal.h

Lines changed: 13 additions & 0 deletions
@@ -35,6 +35,15 @@ struct ggml_cgraph;
 extern "C" {
 #endif

+struct ggml_metal_device_properties {
+    char name[256];
+    float memory;
+    int core_count;
+    int metal_version;
+    int gpu_family;
+    int gpu_family_common;
+};
+
 void ggml_metal_link(const struct ggml_backend_api *);

 //
@@ -56,6 +65,10 @@ GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml

 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

+GGML_API void ggml_backend_metal_get_device_properties(ggml_backend_t backend, struct ggml_metal_device_properties * properties);
+
+GGML_API void ggml_backend_metal_get_device_memory_usage(ggml_backend_t backend, float * used, float * total);
+
 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
 // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
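For symmetry with the CUDA change, here is a hedged caller-side sketch (not part of this commit) of the two new Metal helpers. It assumes `ggml_backend_metal_init()` and `ggml_backend_free()` are available from the vendored ggml headers, and it follows the units implied by the implementation further down: GiB for `memory`, MiB for the usage helper.

```c
// Hedged sketch: query the new Metal device-properties and memory-usage APIs.
#include <stdio.h>
#include "llama.cpp/ggml-metal.h"  // assumed include path, mirroring main.cpp

int main(void) {
    ggml_backend_t backend = ggml_backend_metal_init();  // assumed available
    if (!backend) {
        fprintf(stderr, "no Metal backend available\n");
        return 1;
    }

    struct ggml_metal_device_properties props;
    ggml_backend_metal_get_device_properties(backend, &props);

    float used_mib = 0.0f, total_mib = 0.0f;
    ggml_backend_metal_get_device_memory_usage(backend, &used_mib, &total_mib);

    printf("%s: %.1f GiB, family Apple%d / Common%d, Metal %d\n",
           props.name, props.memory, props.gpu_family,
           props.gpu_family_common, props.metal_version);
    printf("working set: %.1f / %.1f MiB\n", used_mib, total_mib);

    ggml_backend_free(backend);  // assumed available via ggml-backend.h
    return 0;
}
```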

llama.cpp/ggml-metal.m

Lines changed: 28 additions & 0 deletions
@@ -304,6 +304,10 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
     id<MTLDevice> device;
     id<MTLCommandQueue> queue;

+    int family;
+    int family_common;
+    int metal_version;
+
     dispatch_queue_t d_queue;

     struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];
@@ -507,20 +511,23 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
     for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
         if ([ctx->device supportsFamily:i]) {
             GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
+            ctx->family = i - MTLGPUFamilyApple1 + 1;
             break;
         }
     }

     for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) {
         if ([ctx->device supportsFamily:i]) {
             GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
+            ctx->family_common = i - MTLGPUFamilyCommon1 + 1;
             break;
         }
     }

     for (int i = MTLGPUFamilyMetal3 + 5; i >= MTLGPUFamilyMetal3; --i) {
         if ([ctx->device supportsFamily:i]) {
             GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i);
+            ctx->metal_version = i - MTLGPUFamilyMetal3 + 3;
             break;
         }
     }
@@ -3367,6 +3374,27 @@ void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_ca
     ctx->abort_callback_data = user_data;
 }

+void ggml_backend_metal_get_device_properties(ggml_backend_t backend, struct ggml_metal_device_properties * properties) {
+    GGML_ASSERT(ggml_backend_is_metal(backend));
+
+    struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
+
+    strncpy(properties->name, [ctx->device name].UTF8String, sizeof(ctx->device.name));
+    properties->memory = ctx->device.recommendedMaxWorkingSetSize / 1073741824.0 ; // TODO is this what i want? mb? 1024*1024*1024 is bytes to gb
+    properties->gpu_family = ctx->family;
+    properties->gpu_family_common = ctx->family_common;
+    properties->metal_version = ctx->metal_version;
+}
+
+void ggml_backend_metal_get_device_memory_usage(ggml_backend_t backend, float * used, float * total) {
+    GGML_ASSERT(ggml_backend_is_metal(backend));
+
+    struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
+
+    *used = (float)ctx->device.currentAllocatedSize / 1024.0 / 1024.0;
+    *total = (float)ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0;
+}
+
 bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
     GGML_ASSERT(ggml_backend_is_metal(backend));

llama.cpp/llama.cpp

Lines changed: 4 additions & 0 deletions
@@ -17565,6 +17565,10 @@ enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
     return ctx->cparams.pooling_type;
 }

+int32_t llama_model_quant_str(const struct llama_model * model, char * buf, size_t buf_size) {
+    return snprintf(buf, buf_size, "%s", llama_model_ftype_name(model->ftype).c_str());
+}
+
 int32_t llama_n_vocab(const struct llama_model * model) {
     return model->hparams.n_vocab;
 }

llama.cpp/llama.h

Lines changed: 3 additions & 0 deletions
@@ -478,6 +478,9 @@ extern "C" {
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);

+    // Get the quantization of the model
+    LLAMA_API int32_t llama_model_quant_str(const struct llama_model * model, char * buf, size_t buf_size);
+
     // Functions to access the model's GGUF metadata scalar values
     // - The functions return the length of the string on success, or -1 on failure
     // - The output string is always null-terminated and cleared on failure
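A small usage sketch (not part of this commit) for the new `llama_model_quant_str()`: it prints the quantization name of a GGUF model. The surrounding loader calls (`llama_backend_init`, `llama_model_default_params`, `llama_load_model_from_file`, `llama_free_model`, `llama_backend_free`) and the `vocab_only` shortcut are assumptions based on the vendored `llama.h` and may differ slightly.

```c
// Hedged sketch: report a model's quantization via llama_model_quant_str().
#include <stdio.h>
#include "llama.cpp/llama.h"  // assumed include path, mirroring main.cpp

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s MODEL.gguf\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    struct llama_model_params mparams = llama_model_default_params();
    mparams.vocab_only = true;  // metadata is enough for the ftype; skip the weights
    struct llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (!model) {
        fprintf(stderr, "failed to load %s\n", argv[1]);
        return 1;
    }

    char quant[64];
    llama_model_quant_str(model, quant, sizeof(quant));
    printf("quantization: %s\n", quant);  // e.g. a Q4_K-style ftype name

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```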

llama.cpp/main/BUILD.mk

Lines changed: 3 additions & 0 deletions
@@ -13,6 +13,8 @@ o/$(MODE)/llama.cpp/main/main: \
         o/$(MODE)/llama.cpp/main/embedding.o \
         o/$(MODE)/llamafile/server/server.a \
         o/$(MODE)/llama.cpp/server/server.a \
+        o/$(MODE)/localscore/localscore.a \
+        o/$(MODE)/third_party/mbedtls/mbedtls.a \
         o/$(MODE)/llama.cpp/llava/llava.a \
         o/$(MODE)/llama.cpp/llama.cpp.a \
         o/$(MODE)/llamafile/highlight/highlight.a \
@@ -21,6 +23,7 @@ o/$(MODE)/llama.cpp/main/main: \
         o/$(MODE)/llamafile/server/main.1.asc.zip.o \
         $(LLAMA_CPP_SERVER_ASSETS:%=o/$(MODE)/%.zip.o) \
         $(LLAMAFILE_SERVER_ASSETS:%=o/$(MODE)/%.zip.o) \
+        $(THIRD_PARTY_MBEDTLS_A_CERTS:%=o/$(MODE)/%.zip.o) \

 $(LLAMA_CPP_MAIN_OBJS): llama.cpp/main/BUILD.mk

llama.cpp/main/main.cpp

Lines changed: 8 additions & 0 deletions
@@ -27,6 +27,7 @@
 #include "llama.cpp/ggml-metal.h"
 #include "llama.cpp/llava/llava.h"
 #include "llama.cpp/server/server.h"
+#include "localscore/localscore.h"
 #include "llamafile/server/prog.h"

 static llama_context ** g_ctx;
@@ -152,6 +153,7 @@ enum Program {
     CHATBOT,
     EMBEDDING,
     LLAMAFILER,
+    LOCALSCORE
 };

 enum Program determine_program(char *argv[]) {
@@ -168,6 +170,8 @@ enum Program determine_program(char *argv[]) {
             prog = EMBEDDING;
         } else if (!strcmp(argv[i], "--v2")) {
             v2 = true;
+        } else if (!strcmp(argv[i], "--localscore")) {
+            prog = LOCALSCORE;
         }
     }
     if (prog == SERVER && v2) {
@@ -221,6 +225,10 @@ int main(int argc, char ** argv) {
         return embedding_cli(argc, argv);
     }

+    if (prog == LOCALSCORE) {
+        return localscore_cli(argc, argv);
+    }
+
     launch_sigint_thread();

     gpt_params params;
